{
 "metadata": {
  "name": ""
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import scrapy\n",
      "import requests\n",
      "\n",
      "def getResponse(url, encoding='utf8', timeout=10):\n",
      "    \"\"\"Download `url` with requests and wrap it in a scrapy TextResponse.\n",
      "\n",
      "    url      -- address fetched with an HTTP GET.\n",
      "    encoding -- encoding passed to TextResponse (default 'utf8').\n",
      "    timeout  -- seconds handed to requests.get; requests never times out\n",
      "                on its own, so an unresponsive server would hang the cell.\n",
      "\n",
      "    Returns the scrapy.http.TextResponse built from the downloaded text.\n",
      "    \"\"\"\n",
      "    page = requests.get(url, timeout=timeout)\n",
      "    return scrapy.http.TextResponse(url=url, body=page.text, encoding=encoding)\n"
     ],
     "language": "python",
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\u767e\u5ea6\u4e00\u4e0b\uff0c\u4f60\u5c31\u77e5\u9053\n"
       ]
      }
     ],
     "prompt_number": 103
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import scrapy\n",
      "from scrapy.contrib.spiders import CrawlSpider\n",
      "from scrapy.contrib.linkextractors import LinkExtractor\n",
      "\n",
      "class Product(scrapy.Item):\n",
      "    \"\"\"Example scrapy Item declaring four fields.\"\"\"\n",
      "    name = scrapy.Field()\n",
      "    price = scrapy.Field()\n",
      "    stock = scrapy.Field()\n",
      "    # the only field declared with an extra keyword (serializer=str)\n",
      "    last_updated = scrapy.Field(serializer=str)\n",
      "\n",
      "# Populate just two of the declared fields, then show the stored pairs.\n",
      "product = Product(\n",
      "    name=\"Desktop PC\",\n",
      "    price=1000,\n",
      ")\n",
      "product.items()"
     ],
     "language": "python",
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 15,
       "text": [
        "[('price', 1000), ('name', 'Desktop PC')]"
       ]
      }
     ],
     "prompt_number": 15
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from scrapy.selector import Selector\n",
      "from scrapy.http import HtmlResponse\n",
      "\n",
      "body = '<html><body><span>\u4f60\u597d</span></body></html>'\n",
      "span_text = '//span/text()'\n",
      "\n",
      "# 1. Query a Selector created straight from the markup.\n",
      "from_text = Selector(text=body)\n",
      "print '1.', from_text.xpath(span_text).extract()\n",
      "\n",
      "# 2. Query the selector exposed by a response wrapping the same markup.\n",
      "response = HtmlResponse(url='http://www.baidu.com', body=body)\n",
      "print '2.', response.selector.xpath(span_text).extract()"
     ],
     "language": "python",
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "1. [u'\\u4f60\\u597d']\n",
        "2. [u'\\u4f60\\u597d']\n"
       ]
      }
     ],
     "prompt_number": 76
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "sample_url = 'http://doc.scrapy.org/en/latest/_static/selectors-sample1.html'\n",
      "response = getResponse(sample_url)\n"
     ],
     "language": "python",
     "outputs": [],
     "prompt_number": 107
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "title_by_xpath = response.xpath('//title/text()')\n",
      "title_by_css = response.css('title::text')\n",
      "\n",
      "# Selector lists first, then the strings they extract to.\n",
      "print title_by_xpath\n",
      "print title_by_css\n",
      "\n",
      "print title_by_xpath.extract()\n",
      "print title_by_css.extract()\n"
     ],
     "language": "python",
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "[<Selector xpath='//title/text()' data=u'Example website'>]\n",
        "[<Selector xpath=u'descendant-or-self::title/text()' data=u'Example website'>]\n",
        "[u'Example website']\n",
        "[u'Example website']\n"
       ]
      }
     ],
     "prompt_number": 119
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "\"\"\"\n",
      "<html>\n",
      " <head>\n",
      "  <base href='http://example.com/' />\n",
      "  <title>Example website</title>\n",
      " </head>\n",
      " <body>\n",
      "  <div id='images'>\n",
      "   <a href='image1.html'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>\n",
      "   <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>\n",
      "   <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>\n",
      "   <a href='image4.html'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>\n",
      "   <a href='image5.html'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>\n",
      "  </div>\n",
      " </body>\n",
      "</html>\n",
      "\"\"\"\n",
      "\n",
      "# Attribute values via XPath @attr and the CSS ::attr() pseudo-element.\n",
      "attribute_queries = (\n",
      "    response.xpath('//base/@href'),\n",
      "    response.css('base::attr(href)'),\n",
      "    response.xpath('//a/@href'),\n",
      "    response.xpath('//a/img/@src'),\n",
      ")\n",
      "for matches in attribute_queries:\n",
      "    print matches.extract()\n"
     ],
     "language": "python",
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "[u'http://example.com/']\n",
        "[u'http://example.com/']\n",
        "[u'image1.html', u'image2.html', u'image3.html', u'image4.html', u'image5.html']\n",
        "[u'image1_thumb.jpg', u'image2_thumb.jpg', u'image3_thumb.jpg', u'image4_thumb.jpg', u'image5_thumb.jpg']\n"
       ]
      }
     ],
     "prompt_number": 137
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Select every <a> element once; each entry is queried relative to itself below.\n",
      "links = response.xpath('//a')\n",
      "\n",
      "# extract() returns a list, so each %s below renders as a one-element list.\n",
      "for index, link in enumerate(links):\n",
      "    args = (index, link.xpath('@href').extract(), link.xpath('img/@src').extract())\n",
      "    print 'Link number %d points to url %s and image %s' % args"
     ],
     "language": "python",
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Link number 0 points to url [u'image1.html'] and image [u'image1_thumb.jpg']\n",
        "Link number 1 points to url [u'image2.html'] and image [u'image2_thumb.jpg']\n",
        "Link number 2 points to url [u'image3.html'] and image [u'image3_thumb.jpg']\n",
        "Link number 3 points to url [u'image4.html'] and image [u'image4_thumb.jpg']\n",
        "Link number 4 points to url [u'image5.html'] and image [u'image5_thumb.jpg']\n"
       ]
      }
     ],
     "prompt_number": 141
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from scrapy import Selector\n",
      "sel = Selector(text='<a href=\"#\">Click here to go to the <strong>Next Page</strong></a>')\n",
      "\n",
      "# //a//text() yields each descendant text node as its own entry ...\n",
      "text_nodes = sel.xpath('//a//text()')\n",
      "print text_nodes.extract()\n",
      "print '===='\n",
      "# ... while string() collapses the first <a> into a single string.\n",
      "whole_text = sel.xpath('string(//a[1])')\n",
      "print whole_text.extract()"
     ],
     "language": "python",
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "[u'Click here to go to the ', u'Next Page']\n",
        "====\n",
        "[u'Click here to go to the Next Page']\n"
       ]
      }
     ],
     "prompt_number": 180
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def testy():\n",
      "    for i in range(10):\n",
      "        yield i\n",
      "\n",
      "    yield 12\n"
     ],
     "language": "python",
     "outputs": [
      {
       "ename": "SyntaxError",
       "evalue": "invalid syntax (<ipython-input-183-ad377157063e>, line 3)",
       "output_type": "pyerr",
       "traceback": [
        "\u001b[0;36m  File \u001b[0;32m\"<ipython-input-183-ad377157063e>\"\u001b[0;36m, line \u001b[0;32m3\u001b[0m\n\u001b[0;31m    yeild i\u001b[0m\n\u001b[0m          ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n"
       ]
      }
     ],
     "prompt_number": 183
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Calling testy() only creates the generator object; its body has not run yet.\n",
      "a = testy()"
     ],
     "language": "python",
     "outputs": [
      {
       "ename": "NameError",
       "evalue": "name 'testy' is not defined",
       "output_type": "pyerr",
       "traceback": [
        "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
        "\u001b[0;32m<ipython-input-181-a88bdaad2194>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0ma\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtesty\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
        "\u001b[0;31mNameError\u001b[0m: name 'testy' is not defined"
       ]
      }
     ],
     "prompt_number": 181
    }
   ]
  }
 ]
}